SatRday Chicago, April 27, 2019

Parfait Gasana

Data Analyst, Winston & Strawn
  @ParfaitG (GitHub)




R as an Extendable Environment



Relational Databases



Advantages of Relational Databases in Data Science



Programming Interfaces



R DBI Standard



Use Cases



CTA Bus/Rail Aggregation

library(DBI)
library(RPostgreSQL)

# CONNECT
conn <- dbConnect(RPostgreSQL::PostgreSQL(), host=config$pg_conn$host, dbname=config$pg_conn$name,
                  user=config$pg_conn$user, password=config$pg_conn$pwd, port=config$pg_conn$port)
# R AGGREGATION (MULTIPLE FUNCS)
# do.call(data.frame, 
#        aggregate(rides ~ route + routename, agg_csv,
#                  function(x) c(count=length(x), sum=sum(x), mean=mean(x),
#                                median=median(x), min=min(x), max=max(x))))

# POSTGRES AGGREGATION
sql <- 'SELECT rt.route_name, 
               COUNT(rd.rides) AS "count", 
               SUM(rd.rides) AS "sum", 
               AVG(rd.rides) AS "mean", 
               MEDIAN(rd.rides) AS "median",
               R_MEDIAN(rd.rides) AS "r_median",
               MIN(rd.rides) AS "min", 
               MAX(rd.rides) AS "max"
        FROM bus_routes rt
        INNER JOIN bus_rides rd ON rt.route_id = rd.route_id
        GROUP BY rt.route_name
        ORDER BY SUM(rd.rides) DESC
        LIMIT 10'

agg_sql <- dbGetQuery(conn, sql)

knitr::kable(agg_sql)
route_name count sum mean median r_median min max
79th 6513 176238493 27059.50 27672 27672 7896 43081
Ashland 6513 156178845 23979.56 23676 23676 5570 45177
Chicago 6513 130620291 20055.32 21767 21767 4448 35893
Western 6513 129322319 19856.03 19553 19553 4985 37202
Cottage Grove 6513 128999766 19806.50 21181 21181 5489 31187
Belmont 6513 123065617 18895.38 20732 20732 3451 30025
Clark 6513 119459342 18341.68 19171 19171 3310 26896
King Drive 6513 119070638 18282.00 19402 19402 3993 29896
Halsted 6513 118284951 18161.36 19305 19305 2915 30605
Sheridan 6513 118178375 18145.00 18812 18812 2666 33236

Graphing

ggplot(agg_sql, aes(route_name, sum, fill=route_name)) + geom_col(position = "dodge") +
  labs(title="CTA Top 10 Bus Routes by Ridership", x="Year", y="Rides") +
  scale_y_continuous(expand = c(0, 0), label=comma, limit=c(0,2E8)) + guides(fill=FALSE) +
  scale_fill_manual(values=seaborn_palette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))


Calculation + Aggregation

sql <- 'WITH station_agg AS
         (SELECT DATE_PART(\'year\', r.ride_date)::integer AS "year",
                 r.station_id,
                 r.station_name,
                 COUNT(r.rides)::numeric(20,5) AS "count", 
                 SUM(r.rides)::numeric(20,5) AS "sum", 
                 AVG(r.rides)::numeric(20,5) AS "mean", 
                 MEDIAN(r.rides)::numeric(20,5) AS "median",
                 MIN(r.rides)::numeric(20,5) AS "min", 
                 MAX(r.rides)::numeric(20,5) AS "max"
          FROM rail_rides r
          GROUP BY DATE_PART(\'year\', r.ride_date),
                   r.station_id,
                   r.station_name
          ),
                   
      merge_rail AS
         (SELECT s.*, 
                 r.rail_line,
                 (s."sum" / COUNT(*) OVER(PARTITION BY s.station_id, "year")) AS rail_total
          FROM station_agg s
          INNER JOIN rail_stations r ON s.station_id = r.station_id
         )
         
      SELECT m."year", m.rail_line, SUM(m.rail_total)  AS rail_total
      FROM merge_rail m
      GROUP BY m."year", m.rail_line
      ORDER BY m.rail_line, m."year"'
  
agg_sql <- dbGetQuery(conn, sql)

knitr::kable(agg_sql[sample(1:nrow(agg_sql), 20),])
year rail_line rail_total
56 2002 orange 11050080
112 2004 purple_exp 10812035
1 2001 blue 33723000
136 2010 red 61875674
88 2016 pink 10508724
113 2005 purple_exp 11174084
27 2009 brown 17423540
26 2008 brown 15867835
15 2015 blue 47754470
86 2014 pink 10363334
23 2005 brown 15526814
13 2013 blue 45304180
20 2002 brown 15513076
80 2008 pink 8965105
131 2005 red 53082658
52 2016 green 14596581
145 2001 yellow 1175412
126 2018 purple_exp 12506703
111 2003 purple_exp 11049890
65 2011 orange 13065864

Graphing

cta_palette <- c(blue="#00A1DE", brown="#62361B", green="#009B3A", orange="#F9461C", pink="#E27EA6",
                 purple="#522398", purple_exp="#8059BA", red="#C60C30", yellow="#F9E300")

ggplot(agg_sql, aes(year, rail_total, color=rail_line)) + 
  geom_line(stat="identity") + geom_point(stat="identity") +
  labs(title="CTA Rail Ridership By Year", x="Year", y="Rides") +
  scale_x_continuous("year", breaks=unique(agg_sql$year)) +
  scale_y_continuous(expand = c(0, 0), label=comma) +
  scale_color_manual(values = cta_palette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))


Modeling

CREATE MATERIALIZED VIEW Rail_Model_Data AS
    SELECT r.id, r.station_id, r.station_name, r.ride_date, r.day_type, r.rides AS raw, 
          (r.rides / COUNT(*) OVER(PARTITION BY r.station_id, r.ride_date)) AS rides,
          CASE 
               WHEN r.normalized_date BETWEEN '2099-01-01' AND '2099-03-19' THEN 'winter'
               WHEN r.normalized_date BETWEEN '2099-03-20' AND '2099-06-19' THEN 'spring'
               WHEN r.normalized_date BETWEEN '2099-06-20' AND '2099-09-19' THEN 'summer'
               WHEN r.normalized_date BETWEEN '2099-09-20' AND '2099-12-19' THEN 'fall'
               WHEN r.normalized_date BETWEEN '2099-12-20' AND '2099-12-31' THEN 'winter'
               ELSE NULL
           END As season,        
           REPLACE(REPLACE((regexp_split_to_array(s.location, '\s+'))[1], ',', ''), '(', '')::numeric AS latitude,
           REPLACE((regexp_split_to_array(s.location, '\s+'))[2], ')', '')::numeric AS longitude,
           s.rail_line, s.ada, s.direction,
           ue.ue_rate, g.gas_price, w.avg_temp, w.precipitation, w.snow_depth
    FROM 
       (
        SELECT id, station_id, station_name, day_type, rides, ride_date, 
               ride_date + (2099 - date_part('year', ride_date)  ||' year')::interval as normalized_date
        FROM rail_rides
       )r
    INNER JOIN rail_stations s ON s.station_id = r.station_id
    INNER JOIN unemployment_rates ue ON ue.ue_date = r.ride_date
    INNER JOIN gas_prices g ON g.gas_date = r.ride_date
    INNER JOIN weather_data w ON w.weather_date = r.ride_date
    ORDER BY r.ride_date, r.station_id;
    
REFRESH MATERIALIZED VIEW Rail_Model_Data;
rail_model_data <- dbGetQuery(conn, "SELECT * FROM rail_model_data")

head(rail_model_data)
model <- lm(rides ~ day_type + season + latitude + longitude + rail_line + 
                    ue_rate + gas_price + avg_temp + precipitation + snow_depth, 
            data = rail_model_data)

Analysis of Variance

Df Sum Sq Mean Sq F value Pr(>F)
day_type 2 445663599687.39 222831799843.70 64679.23 0.0000
season 3 12065643417.40 4021881139.13 1167.39 0.0000
latitude 1 86867071786.38 86867071786.38 25214.06 0.0000
longitude 1 319342686.35 319342686.35 92.69 0.0000
rail_line 8 1832581552014.08 229072694001.76 66490.71 0.0000
ue_rate 1 1655357961.89 1655357961.89 480.48 0.0000
gas_price 1 22101747908.78 22101747908.78 6415.26 0.0000
avg_temp 1 3204630785.23 3204630785.23 930.18 0.0000
precipitation 1 878964700.83 878964700.83 255.13 0.0000
snow_depth 1 8106971.91 8106971.91 2.35 0.1250
Residuals 1038214 3576837505171.89 3445183.27


Point Estimates

Estimate Std. Error t value Pr(>|t|)
(Intercept) -67333.2410 3539.2418 -19.02 0.0000
day_typeU -454.9352 6.6213 -68.71 0.0000
day_typeW 1159.1548 5.2660 220.12 0.0000
seasonspring -180.2668 5.3604 -33.63 0.0000
seasonsummer -179.9820 6.5945 -27.29 0.0000
seasonwinter -160.1206 6.1885 -25.87 0.0000
latitude -3954.6687 36.5219 -108.28 0.0000
longitude -2679.7920 43.6084 -61.45 0.0000
rail_linebrown -1187.7668 7.2026 -164.91 0.0000
rail_linegreen -2076.7759 6.8344 -303.87 0.0000
rail_lineorange -1082.7685 8.2805 -130.76 0.0000
rail_linepink -2215.7928 7.3581 -301.13 0.0000
rail_linepurple -2043.6363 11.1396 -183.46 0.0000
rail_linepurple_exp -1558.0431 7.6564 -203.50 0.0000
rail_linered 1636.6617 6.9037 237.07 0.0000
rail_lineyellow -1418.1369 17.5558 -80.78 0.0000
ue_rate -7.6397 0.9665 -7.90 0.0000
gas_price 188.1598 2.4375 77.19 0.0000
avg_temp 5.1418 0.1658 31.01 0.0000
precipitation -84.0921 5.2553 -16.00 0.0000
snow_depth 1.8487 1.2051 1.53 0.1250
graph_data <- data.frame(param = names(model$coefficients[-1]),
                         value = model$coefficients[-1],
                         row.names = NULL)

ggplot(graph_data) + geom_col(aes(x=param, y=value, fill=param), position = "dodge") +
  labs(title="CTA System Rail Regression Point Estimates", x="Parameters", y="Value") +
  guides(fill=FALSE) + ylim(-4000, 2000) + 
  scale_fill_manual(values = seaborn_palette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=45, vjust=0.95, hjust=0.95))

# DISCONNECT FROM DATABASE
dbDisconnect(conn)
## [1] TRUE


Divvy Time and Distance Calculation

options(connectionObserver = NULL)

library(odbc)

conn <- dbConnect(odbc::odbc(),
                  instance = config$db2_conn$instance,
                  driver = config$db2_conn$driver,
                  database = config$db2_conn$database,
                  server = config$db2_conn$server,
                  hostname = config$db2_conn$hostname,
                  port = config$db2_conn$port,
                  uid = config$db2_conn$uid,
                  pwd = config$db2_conn$pwd,
                  LongDataCompat = 1)
day_df <- dbGetQuery(conn, "SELECT t.START_TIME, t.STOP_TIME,
                                   t.TRIP_DURATION
                            FROM TRIPS t
                            WHERE DATE(t.START_TIME) = TO_DATE('2015-09-23', 'YYYY-MM-DD')
                            ORDER BY t.START_TIME")

knitr::kable(head(day_df, 20))
START_TIME STOP_TIME TRIP_DURATION
2015-09-23 00:00:00 2015-09-23 00:07:00 414
2015-09-23 00:00:00 2015-09-23 00:03:00 183
2015-09-23 00:01:00 2015-09-23 00:09:00 450
2015-09-23 00:01:00 2015-09-23 00:24:00 1350
2015-09-23 00:01:00 2015-09-23 00:07:00 402
2015-09-23 00:02:00 2015-09-23 00:15:00 793
2015-09-23 00:03:00 2015-09-23 00:27:00 1435
2015-09-23 00:04:00 2015-09-23 00:06:00 134
2015-09-23 00:05:00 2015-09-23 00:19:00 820
2015-09-23 00:06:00 2015-09-23 00:15:00 526
2015-09-23 00:07:00 2015-09-23 00:15:00 506
2015-09-23 00:08:00 2015-09-23 00:15:00 425
2015-09-23 00:09:00 2015-09-23 00:32:00 1372
2015-09-23 00:10:00 2015-09-23 00:24:00 881
2015-09-23 00:10:00 2015-09-23 00:35:00 1445
2015-09-23 00:10:00 2015-09-23 00:26:00 935
2015-09-23 00:11:00 2015-09-23 00:23:00 717
2015-09-23 00:13:00 2015-09-23 00:19:00 382
2015-09-23 00:14:00 2015-09-23 00:15:00 66
2015-09-23 00:16:00 2015-09-23 00:38:00 1316
ggplot(day_df, aes(START_TIME, TRIP_DURATION)) + geom_line(color=seaborn_palette[1]) +
  xlab("Start Time") + ylab("Trip Duration")

agg_sql <- dbGetQuery(conn, "WITH sub AS 
                                (SELECT t.FROM_STATION_ID, t.FROM_STATION_NAME,
                                        t.FROM_LATITUDE, t.FROM_LONGITUDE,
                                        ROUND(CASE 
                                                   WHEN t.TO_LATITUDE IS NOT NULL AND t.FROM_LATITUDE IS NOT NULL
                                                   THEN
                                                        3963.1 * (2 * ATAN(1) - 
                                                                  ASIN(SIN(RADIANS(t.FROM_LATITUDE)) * SIN(RADIANS(t.TO_LONGITUDE)) + 
                                                                       COS(RADIANS(t.FROM_LATITUDE)) * COS(RADIANS(t.TO_LONGITUDE)) * 
                                                                       COS(RADIANS(t.TO_LONGITUDE - t.FROM_LONGITUDE))
                                                                      ) 
                                                                 )
                                                   ELSE NULL
                                              END, 4) AS TRIP_DISTANCE
                                 FROM TRIPS t
                                 WHERE DATE(t.START_TIME) = TO_DATE('2015-09-23', 'YYYY-MM-DD'))
                            
                            SELECT FROM_STATION_ID, FROM_STATION_NAME, 
                                   FROM_LATITUDE, FROM_LONGITUDE,
                                   MIN(TRIP_DISTANCE) AS MIN_DISTANCE,
                                   MAX(TRIP_DISTANCE) AS MAX_DISTANCE
                            
                            FROM sub 
                            GROUP BY FROM_STATION_ID, FROM_STATION_NAME, 
                                     FROM_LATITUDE, FROM_LONGITUDE
                            ORDER BY MAX(TRIP_DISTANCE) DESC
                            FETCH FIRST 10 ROWS ONLY;")

knitr::kable(agg_sql)
FROM_STATION_ID FROM_STATION_NAME FROM_LATITUDE FROM_LONGITUDE MIN_DISTANCE MAX_DISTANCE
467 Western Ave & Lunt Ave 42.00859 -87.69049 8969.148 8971.314
494 Kedzie Ave & Bryn Mawr Ave 41.98240 -87.70892 8969.882 8971.135
447 Glenwood Ave & Morse Ave 42.00797 -87.66550 8967.513 8971.132
466 Ridge Blvd & Touhy Ave 42.01213 -87.68291 8969.393 8970.895
294 Broadway & Berwyn Ave 41.97835 -87.65975 8963.826 8970.855
479 Drake Ave & Montrose Ave 41.96115 -87.71657 8969.369 8970.804
469 St. Louis Ave & Balmoral Ave 41.98039 -87.71611 8967.694 8970.700
450 Warren Park West 42.00201 -87.68973 8970.580 8970.580
451 Sheridan Rd & Loyola Ave 42.00104 -87.66120 8966.526 8970.579
325 Clark St & Winnemac Ave 41.97339 -87.66836 8964.613 8970.512

Mapping

divvy_from <- dbGetQuery(conn, "SELECT t.FROM_STATION_NAME, 
                                       t.FROM_LATITUDE, 
                                       t.FROM_LONGITUDE,
                                       SUM(t.TRIP_DURATION) AS Total_Duration,
                                       MIN(t.TRIP_DURATION) AS Min_Duration,
                                       AVG(t.TRIP_DURATION) AS Avg_Duration,
                                       MEDIAN(t.TRIP_DURATION) AS Median_Duration,
                                       MAX(t.TRIP_DURATION) AS Max_Duration
                                       
                                FROM TRIPS t
                                GROUP BY t.FROM_STATION_NAME,
                                         t.FROM_LATITUDE, 
                                         t.FROM_LONGITUDE
                                ORDER BY SUM(t.TRIP_DURATION) DESC
                                FETCH FIRST 10 ROWS ONLY;")
knitr::kable(divvy_from)
FROM_STATION_NAME FROM_LATITUDE FROM_LONGITUDE TOTAL_DURATION MIN_DURATION AVG_DURATION MEDIAN_DURATION MAX_DURATION
Lake Shore Dr & Monroe St 41.88096 -87.61674 444455108 60 1803 1347 6403880
Streeter Dr & Grand Ave 41.89228 -87.61204 431638473 60 1956 1417 5444590
Michigan Ave & Oak St 41.90096 -87.62378 351053592 60 1815 1300 2549130
Millennium Park 41.88103 -87.62408 347331136 60 1820 1241 6785180
Theater on the Lake 41.92628 -87.63083 330660276 60 1524 1285 740561
Lake Shore Dr & North Blvd 41.91172 -87.62680 286215561 60 1452 1173 336578
Streeter Dr & Illinois St 41.89107 -87.61220 222454638 60 1615 1320 84073
Canal St & Adams St 41.87926 -87.63990 184661822 60 837 614 9961130
Michigan Ave & Washington St 41.88389 -87.62465 179427697 60 1160 649 1876990
Shedd Aquarium 41.86722 -87.61535 178996820 60 1675 1307 1389890
ggplot(transform(divvy_from, FROM_STATION_NAME=gsub("&", "\n&", FROM_STATION_NAME)),
                 aes(FROM_STATION_NAME, TOTAL_DURATION, fill=FROM_STATION_NAME)) +
  geom_col(position = "dodge") +
  labs(title="Divvy Top 10 Origination Stations", x="Station", y="Trip Duration (seconds)") +
  scale_y_continuous(expand = c(0, 0), label=comma) + guides(fill=FALSE) +
  scale_fill_manual(values=seaborn_palette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))

divvy_to <- dbGetQuery(conn, "SELECT t.TO_STATION_NAME, 
                                     t.TO_LATITUDE, 
                                     t.TO_LONGITUDE,
                                     SUM(t.TRIP_DURATION) AS Total_Duration,
                                     MIN(t.TRIP_DURATION) AS Min_Duration,
                                     AVG(t.TRIP_DURATION) AS Avg_Duration,
                                     MEDIAN(t.TRIP_DURATION) AS Median_Duration,
                                     MAX(t.TRIP_DURATION) AS Max_Duration
                                   
                            FROM TRIPS t
                            WHERE TO_STATION_NAME <> 'DIVVY Map Frame B/C Station'
                            GROUP BY t.TO_STATION_NAME,
                                     t.TO_LATITUDE, 
                                     t.TO_LONGITUDE
                            ORDER BY SUM(t.TRIP_DURATION) DESC
                            FETCH FIRST 10 ROWS ONLY;")
knitr::kable(divvy_to)
TO_STATION_NAME TO_LATITUDE TO_LONGITUDE TOTAL_DURATION MIN_DURATION AVG_DURATION MEDIAN_DURATION MAX_DURATION
Streeter Dr & Grand Ave 41.89228 -87.61204 469992965 60 1914 1432 297905
Lake Shore Dr & Monroe St 41.88096 -87.61674 396754905 60 1702 1318 731920
Theater on the Lake 41.92628 -87.63083 370722559 60 1598 1374 383963
Michigan Ave & Oak St 41.90096 -87.62378 369016876 60 1782 1306 2549130
Millennium Park 41.88103 -87.62408 344723608 60 1637 1157 566401
Lake Shore Dr & North Blvd 41.91172 -87.62680 337381528 60 1524 1295 269506
Streeter Dr & Illinois St 41.89107 -87.61220 264397622 60 1611 1325 85368
Michigan Ave & Washington St 41.88389 -87.62465 158901379 60 1003 572 3054250
Shedd Aquarium 41.86722 -87.61535 158356779 60 1563 1307 184015
Adler Planetarium 41.86610 -87.60727 153361126 60 1635 1357 524315
ggplot(transform(divvy_to, TO_STATION_NAME=gsub("&", "\n&", TO_STATION_NAME)), 
       aes(TO_STATION_NAME, TOTAL_DURATION, fill=TO_STATION_NAME)) +
  geom_col(position = "dodge") +
  labs(title="Divvy Top 10 Destination Stations", x="Station", y="Trip Duration (seconds)") +
  scale_y_continuous(expand = c(0, 0), label=comma) + guides(fill=FALSE) +
  scale_fill_manual(values=seaborn_palette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))

library(jsonlite)

x <- toJSON(divvy_from[c("FROM_STATION_NAME", "FROM_LATITUDE", "FROM_LONGITUDE")], pretty=TRUE)

# EXPORT TO FILE
fileConn <- file("Divvy_From_Coords.json")
writeLines(x, fileConn)
close(fileConn)

x
## [
##   {
##     "FROM_STATION_NAME": "Lake Shore Dr & Monroe St",
##     "FROM_LATITUDE": 41.881,
##     "FROM_LONGITUDE": -87.6167
##   },
##   {
##     "FROM_STATION_NAME": "Streeter Dr & Grand Ave",
##     "FROM_LATITUDE": 41.8923,
##     "FROM_LONGITUDE": -87.612
##   },
##   {
##     "FROM_STATION_NAME": "Michigan Ave & Oak St",
##     "FROM_LATITUDE": 41.901,
##     "FROM_LONGITUDE": -87.6238
##   },
##   {
##     "FROM_STATION_NAME": "Millennium Park",
##     "FROM_LATITUDE": 41.881,
##     "FROM_LONGITUDE": -87.6241
##   },
##   {
##     "FROM_STATION_NAME": "Theater on the Lake",
##     "FROM_LATITUDE": 41.9263,
##     "FROM_LONGITUDE": -87.6308
##   },
##   {
##     "FROM_STATION_NAME": "Lake Shore Dr & North Blvd",
##     "FROM_LATITUDE": 41.9117,
##     "FROM_LONGITUDE": -87.6268
##   },
##   {
##     "FROM_STATION_NAME": "Streeter Dr & Illinois St",
##     "FROM_LATITUDE": 41.8911,
##     "FROM_LONGITUDE": -87.6122
##   },
##   {
##     "FROM_STATION_NAME": "Canal St & Adams St",
##     "FROM_LATITUDE": 41.8793,
##     "FROM_LONGITUDE": -87.6399
##   },
##   {
##     "FROM_STATION_NAME": "Michigan Ave & Washington St",
##     "FROM_LATITUDE": 41.8839,
##     "FROM_LONGITUDE": -87.6246
##   },
##   {
##     "FROM_STATION_NAME": "Shedd Aquarium",
##     "FROM_LATITUDE": 41.8672,
##     "FROM_LONGITUDE": -87.6154
##   }
## ]
x <- toJSON(divvy_to[c("TO_STATION_NAME", "TO_LATITUDE", "TO_LONGITUDE")], pretty=TRUE)

# EXPORT TO FILE
fileConn <- file("Divvy_To_Coords.json")
writeLines(x, fileConn)
close(fileConn)

x
## [
##   {
##     "TO_STATION_NAME": "Streeter Dr & Grand Ave",
##     "TO_LATITUDE": 41.8923,
##     "TO_LONGITUDE": -87.612
##   },
##   {
##     "TO_STATION_NAME": "Lake Shore Dr & Monroe St",
##     "TO_LATITUDE": 41.881,
##     "TO_LONGITUDE": -87.6167
##   },
##   {
##     "TO_STATION_NAME": "Theater on the Lake",
##     "TO_LATITUDE": 41.9263,
##     "TO_LONGITUDE": -87.6308
##   },
##   {
##     "TO_STATION_NAME": "Michigan Ave & Oak St",
##     "TO_LATITUDE": 41.901,
##     "TO_LONGITUDE": -87.6238
##   },
##   {
##     "TO_STATION_NAME": "Millennium Park",
##     "TO_LATITUDE": 41.881,
##     "TO_LONGITUDE": -87.6241
##   },
##   {
##     "TO_STATION_NAME": "Lake Shore Dr & North Blvd",
##     "TO_LATITUDE": 41.9117,
##     "TO_LONGITUDE": -87.6268
##   },
##   {
##     "TO_STATION_NAME": "Streeter Dr & Illinois St",
##     "TO_LATITUDE": 41.8911,
##     "TO_LONGITUDE": -87.6122
##   },
##   {
##     "TO_STATION_NAME": "Michigan Ave & Washington St",
##     "TO_LATITUDE": 41.8839,
##     "TO_LONGITUDE": -87.6246
##   },
##   {
##     "TO_STATION_NAME": "Shedd Aquarium",
##     "TO_LATITUDE": 41.8672,
##     "TO_LONGITUDE": -87.6154
##   },
##   {
##     "TO_STATION_NAME": "Adler Planetarium",
##     "TO_LATITUDE": 41.8661,
##     "TO_LONGITUDE": -87.6073
##   }
## ]

# DISCONNECT FROM DATABASE
dbDisconnect(conn)


Metra Data Normalization and Testing

library(RSQLite)

conn <- dbConnect(RSQLite::SQLite(), dbname=config$sqlite_conn$database)

By Year

sql <- "SELECT l.Line, 
               t.Line AS Short,
               CAST(strftime('%Y', Report_Month) AS INT) as Year,
               SUM(t.Late_Trains) AS Total_Late_Trains,
               AVG(t.Late_Trains) AS Avg_Late_Trains
        FROM Trains t
        INNER JOIN Lines l ON t.LineID = l.ID
        WHERE l.Line <> 'SYSTEM'
        GROUP BY l.Line,
                 t.Line,
                 strftime('%Y', t.Report_Month)"

agg_sql <- dbGetQuery(conn, sql)

knitr::kable(agg_sql[sample(1:nrow(agg_sql), 20),])
Line Short Year Total_Late_Trains Avg_Late_Trains
18 Heritage Corridor Heritage 2016 303 6.586957
27 Metra Electric Blue Island Elec-SC 2015 771 11.681818
6 BNSF Railway BNSF 2014 7955 110.486111
46 Metra Electric South Chicago Elec-BI 2014 663 11.050000
28 Metra Electric Blue Island Elec-SC 2016 722 10.027778
30 Metra Electric Blue Island Elec-SC 2018 565 7.847222
29 Metra Electric Blue Island Elec-SC 2017 729 10.125000
101 Union Pacific / North UP-N 2009 2868 39.833333
16 Heritage Corridor Heritage 2014 393 10.342105
83 Rock Island District RI 2011 3223 44.763889
32 Metra Electric Main Line Elec-ML 2010 2021 28.069444
36 Metra Electric Main Line Elec-ML 2014 2032 28.222222
129 Union Pacific / West UP-W 2017 2962 41.138889
39 Metra Electric Main Line Elec-ML 2017 1377 19.125000
75 North Central Service NCS 2013 1314 27.375000
47 Metra Electric South Chicago Elec-BI 2015 552 10.036364
87 Rock Island District RI 2015 1577 23.893939
66 Milwaukee District / West Milw-W 2014 3175 44.097222
19 Heritage Corridor Heritage 2017 363 7.562500
26 Metra Electric Blue Island Elec-SC 2014 973 13.513889
ggplot(agg_sql, aes(Year, Total_Late_Trains, color=Line)) + 
  geom_line(stat="identity") +
  labs(title="Metra Late Trains by Line", x="Year", y="Late Trains") +
  scale_x_continuous("year", breaks=unique(agg_sql$Year)) +
  scale_y_continuous(expand = c(0, 0), label=comma) + guides(fill=FALSE) +
  scale_color_manual(values=seaborn_palette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))


Statistical Testing

T-Tests

lines_list <- split(agg_sql, agg_sql$Short)

nms <- lapply(combn(unique(agg_sql$Short), 2, simplify=FALSE), function(i) paste(i, collapse="_"))

res <- t(sapply(combn(unique(agg_sql$Short), 2, simplify=FALSE), function(i) {
  t <- t.test(lines_list[[i[1]]]$Total_Late_Trains, lines_list[[i[2]]]$Total_Late_Trains)
  c(statistic = t$statistic, p_value = t$p.value)
}))

row.names(res) <- nms

knitr::kable(res)
statistic.t p_value
BNSF_Heritage 9.8494756 0.0000034
BNSF_Elec-SC 8.7158417 0.0000076
BNSF_Elec-ML 6.1374028 0.0000957
BNSF_Elec-BI 9.3920454 0.0000054
BNSF_Milw-N 2.8910097 0.0111757
BNSF_Milw-W 4.9595192 0.0002727
BNSF_NCS 7.9789334 0.0000143
BNSF_RI 4.5415218 0.0006351
BNSF_SWS 7.5383993 0.0000207
BNSF_UP-N 4.9702928 0.0001637
BNSF_UP-NW 5.4107001 0.0002478
BNSF_UP-W 3.7401929 0.0021030
Heritage_Elec-SC -5.2621722 0.0000918
Heritage_Elec-ML -12.0629835 0.0000001
Heritage_Elec-BI -3.4987332 0.0027396
Heritage_Milw-N -10.3277589 0.0000017
Heritage_Milw-W -9.1513465 0.0000037
Heritage_NCS -7.7996898 0.0000020
Heritage_RI -11.0240760 0.0000006
Heritage_SWS -8.6861313 0.0000010
Heritage_UP-N -6.3532760 0.0001028
Heritage_UP-NW -13.6677426 0.0000000
Heritage_UP-W -9.5307864 0.0000033
Elec-SC_Elec-ML -7.6930971 0.0000013
Elec-SC_Elec-BI 3.0553272 0.0090725
Elec-SC_Milw-N -8.4524134 0.0000058
Elec-SC_Milw-W -6.7021299 0.0000291
Elec-SC_NCS -2.6856024 0.0153183
Elec-SC_RI -8.3005633 0.0000030
Elec-SC_SWS -3.9255453 0.0011167
Elec-SC_UP-N -4.5812945 0.0009268
Elec-SC_UP-NW -9.4434609 0.0000001
Elec-SC_UP-W -7.5259529 0.0000146
Elec-ML_Elec-BI 10.7051161 0.0000005
Elec-ML_Milw-N -4.3660738 0.0008571
Elec-ML_Milw-W -1.6151844 0.1279065
Elec-ML_NCS 5.2419772 0.0000733
Elec-ML_RI -2.7359672 0.0152246
Elec-ML_SWS 3.8518590 0.0012327
Elec-ML_UP-N -0.7261232 0.4813509
Elec-ML_UP-NW -1.8073859 0.0875269
Elec-ML_UP-W -3.2060881 0.0069770
Elec-BI_Milw-N -9.6182342 0.0000038
Elec-BI_Milw-W -8.2352811 0.0000119
Elec-BI_NCS -5.9428602 0.0000643
Elec-BI_RI -10.0649428 0.0000019
Elec-BI_SWS -7.0205384 0.0000176
Elec-BI_UP-N -5.6282342 0.0002825
Elec-BI_UP-NW -12.4136552 0.0000001
Elec-BI_UP-W -8.7676538 0.0000080
Milw-N_Milw-W 2.6700024 0.0162767
Milw-N_NCS 7.2630725 0.0000175
Milw-N_RI 2.0467572 0.0573187
Milw-N_SWS 6.5511394 0.0000355
Milw-N_UP-N 2.7933483 0.0120074
Milw-N_UP-NW 3.2584554 0.0063262
Milw-N_UP-W 1.0801680 0.2944043
Milw-W_NCS 5.1854922 0.0002195
Milw-W_RI -0.8225248 0.4216052
Milw-W_SWS 4.2973444 0.0008763
Milw-W_UP-N 0.4888439 0.6313060
Milw-W_UP-NW 0.2804970 0.7829139
Milw-W_UP-W -1.5653195 0.1356000
NCS_RI -6.6347574 0.0000189
NCS_SWS -1.3501689 0.1939515
NCS_UP-N -3.4515401 0.0055512
NCS_UP-NW -7.0452631 0.0000029
NCS_UP-W -6.2606259 0.0000586
RI_SWS 5.6476491 0.0000687
RI_UP-N 1.1936645 0.2499458
RI_UP-NW 1.3014342 0.2118233
RI_UP-W -0.8818209 0.3903462
SWS_UP-N -2.7957833 0.0169999
SWS_UP-NW -5.6470712 0.0000295
SWS_UP-W -5.5094534 0.0001466
UP-N_UP-NW -0.3290665 0.7474460
UP-N_UP-W -1.8187654 0.0857421
UP-NW_UP-W -2.0429687 0.0613004

By Month

sql <- "SELECT l.ID,
               l.Line, 
               t.Line AS Short,
               CAST(strftime('%m', Report_Month) AS INT) as Month_Num,
               SUM(t.Late_Trains) AS Total_Late_Trains,
               AVG(t.Late_Trains) AS Avg_Late_Trains
        FROM Trains t
        INNER JOIN Lines l ON t.LineID = l.ID
        WHERE l.Line <> 'SYSTEM'
        GROUP BY l.ID,
                 l.Line,
                 t.Line,
                 CAST(strftime('%m', Report_Month) AS INT)
        ORDER BY l.ID,
                 CAST(strftime('%m', Report_Month) AS INT)"

agg_sql <- transform(dbGetQuery(conn, sql), Month = factor(month.abb[Month_Num], levels=month.abb))

knitr::kable(agg_sql[sample(1:nrow(agg_sql), 20),])
ID Line Short Month_Num Total_Late_Trains Avg_Late_Trains Month
68 6 Milwaukee District / North Milw-N 8 2797 46.61667 Aug
96 8 North Central Service NCS 12 1059 29.41667 Dec
146 13 Union Pacific / West UP-W 2 2500 41.66667 Feb
123 11 Union Pacific / North UP-N 3 1342 22.36667 Mar
117 10 SouthWest Service SWS 9 924 18.48000 Sep
132 11 Union Pacific / North UP-N 12 1466 27.14815 Dec
7 1 BNSF Railway BNSF 7 4314 71.90000 Jul
119 10 SouthWest Service SWS 11 1204 24.08000 Nov
73 7 Milwaukee District / West Milw-W 1 2347 39.11667 Jan
55 5 Metra Electric Blue Island Elec-SC 7 1204 20.06667 Jul
92 8 North Central Service NCS 8 945 23.62500 Aug
69 6 Milwaukee District / North Milw-N 9 2249 37.48333 Sep
1 1 BNSF Railway BNSF 1 4690 78.16667 Jan
154 13 Union Pacific / West UP-W 10 2212 36.86667 Oct
74 7 Milwaukee District / West Milw-W 2 2735 45.58333 Feb
59 5 Metra Electric Blue Island Elec-SC 11 774 12.90000 Nov
66 6 Milwaukee District / North Milw-N 6 3329 55.48333 Jun
62 6 Milwaukee District / North Milw-N 2 3327 55.45000 Feb
27 3 Metra Electric Main Line Elec-ML 3 1202 20.03333 Mar
103 9 Rock Island District RI 7 2929 48.81667 Jul
ggplot(agg_sql, aes(Month, Total_Late_Trains, fill=Line)) + 
  geom_col(position = "dodge") +
  labs(title="Metra Late Trains by Month", x="Year", y="Late Trains") +
  scale_x_discrete("Month", breaks=unique(agg_sql$Month)) +
  scale_y_continuous(expand = c(0, 0), label=comma) + guides(fill=FALSE) +
  scale_fill_manual(values=seaborn_palette) +
  facet_wrap(~Line, ncol=3, scales="free_y") +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))

Correlations

sql <- "SELECT CAST(strftime('%Y', c.Report_Month) AS INT) AS Year,
               SUM(CASE WHEN c.Late_Cause = 'Accident' THEN c.Late_Trains ELSE NULL END) AS [Accident],
               SUM(CASE WHEN c.Late_Cause = 'Catenary Failure' THEN c.Late_Trains ELSE NULL END) AS [Catenary],               
               SUM(CASE WHEN c.Late_Cause = 'Freight Interference - Peak' THEN c.Late_Trains ELSE NULL END) AS [Freight Interference Peak],
               SUM(CASE WHEN c.Late_Cause = 'Freight Interference - Off-Peak' THEN c.Late_Trains ELSE NULL END) AS [Freight Interference Off Peak],  
               SUM(CASE WHEN c.Late_Cause = 'Human Error' THEN c.Late_Trains ELSE NULL END) AS [Human Error],                     
               SUM(CASE WHEN c.Late_Cause = 'Lift Deployment' THEN c.Late_Trains ELSE NULL END) AS [Lift Deployment],                
               SUM(CASE WHEN c.Late_Cause = 'Locomotive Failure' THEN c.Late_Trains ELSE NULL END) AS [Locomotive Failure],              
               SUM(CASE WHEN c.Late_Cause = 'Non-Locomotive Equipment Failure' THEN c.Late_Trains ELSE NULL END) AS [Non-Locomotive Equipment Failure], 
               SUM(CASE WHEN c.Late_Cause = 'Obstruction/Debris' THEN c.Late_Trains ELSE NULL END) AS [Obstruction_Debris],              
               SUM(CASE WHEN c.Late_Cause = 'Other' THEN c.Late_Trains ELSE NULL END) AS [Other],                            
               SUM(CASE WHEN c.Late_Cause = 'Passenger Loading' THEN c.Late_Trains ELSE NULL END) AS [Passenger Loading],
               SUM(CASE WHEN c.Late_Cause = 'Passenger Train Interference' THEN c.Late_Trains ELSE NULL END) AS [Passenger Train Interference],     
               SUM(CASE WHEN c.Late_Cause = 'Sick, Injured, Unruly Passenger' THEN c.Late_Trains ELSE NULL END) AS [Sick Injured Unruly Passenger],
               SUM(CASE WHEN c.Late_Cause = 'Signal/Switch Failure' THEN c.Late_Trains ELSE NULL END) AS [Signal Switch Failure],           
               SUM(CASE WHEN c.Late_Cause = 'Track Work' THEN c.Late_Trains ELSE NULL END) AS [Track Work],                      
               SUM(CASE WHEN c.Late_Cause = 'Weather' THEN c.Late_Trains ELSE NULL END) AS [Weather]            FROM Causes c 
      GROUP BY CAST(strftime('%Y', c.Report_Month) AS INT);"

wide_sql <- dbGetQuery(conn, sql)

knitr::kable(wide_sql)
Year Accident Catenary Freight Interference Peak Freight Interference Off Peak Human Error Lift Deployment Locomotive Failure Non-Locomotive Equipment Failure Obstruction_Debris Other Passenger Loading Passenger Train Interference Sick Injured Unruly Passenger Signal Switch Failure Track Work Weather
2009 522 116 688 1040 1058 510 1202 402 798 538 2736 608 788 2798 1616 2150
2010 1600 196 1754 3054 2212 1082 2476 1144 1468 1158 4130 1484 1670 5290 2746 2946
2011 1338 80 990 2272 1740 902 1320 486 802 792 4290 988 1000 3296 2758 3094
2012 932 162 496 1474 1294 500 1086 326 848 638 2364 440 874 2506 1806 1262
2013 1150 338 642 1706 1508 410 1202 468 782 526 2276 404 768 3274 1328 2194
2014 1370 144 1336 2334 1470 428 1658 828 1100 568 1509 490 732 2536 1964 3322
2015 898 302 726 1360 1318 330 NA NA 870 488 1084 260 574 1770 1188 1950
2016 1088 200 618 1066 1270 290 NA NA 892 484 1082 308 814 2782 2000 1108
2017 1268 66 718 1294 1796 496 NA NA 1090 640 1162 266 712 2446 1896 1194
2018 814 154 1112 1916 1990 640 NA NA 1230 562 1324 444 776 3478 1570 2108
knitr::kable(cor(wide_sql[-1], use = "complete.obs", method="pearson"))
Accident Catenary Freight Interference Peak Freight Interference Off Peak Human Error Lift Deployment Locomotive Failure Non-Locomotive Equipment Failure Obstruction_Debris Other Passenger Loading Passenger Train Interference Sick Injured Unruly Passenger Signal Switch Failure Track Work Weather
Accident 1.0000000 0.1435415 0.7806773 0.9608999 0.8976619 0.5824678 0.7215697 0.7642611 0.6828030 0.6668736 0.3463388 0.5872245 0.5954141 0.6114195 0.6652174 0.6677474
Catenary 0.1435415 1.0000000 -0.1255482 0.0037574 0.1328661 -0.2913271 0.0296204 0.0639429 0.0278461 -0.1184067 -0.2985693 -0.2269049 -0.0105223 0.2158549 -0.5200273 -0.2466628
Freight Interference Peak 0.7806773 -0.1255482 1.0000000 0.9091988 0.8020245 0.6661686 0.9547198 0.9741564 0.9170038 0.7587071 0.3725510 0.7711136 0.7310978 0.7209778 0.7028399 0.7798693
Freight Interference Off Peak 0.9608999 0.0037574 0.9091988 1.0000000 0.9399653 0.7224539 0.8670727 0.8811132 0.8227255 0.8051196 0.4622986 0.7564155 0.7459639 0.7342568 0.7756815 0.7157045
Human Error 0.8976619 0.1328661 0.8020245 0.9399653 1.0000000 0.8356280 0.8342163 0.7914287 0.7573302 0.8979099 0.6539403 0.8494920 0.8652815 0.8833295 0.7699593 0.5505740
Lift Deployment 0.5824678 -0.2913271 0.6661686 0.7224539 0.8356280 1.0000000 0.7013289 0.5799427 0.5998879 0.9445498 0.9267498 0.9752919 0.9003905 0.8317744 0.9168026 0.4295133
Locomotive Failure 0.7215697 0.0296204 0.9547198 0.8670727 0.8342163 0.7013289 1.0000000 0.9781492 0.9757232 0.8446837 0.4127356 0.8198952 0.8583588 0.8499444 0.6355407 0.5798980
Non-Locomotive Equipment Failure 0.7642611 0.0639429 0.9741564 0.8811132 0.7914287 0.5799427 0.9781492 1.0000000 0.9622677 0.7347822 0.2636146 0.7106868 0.7395528 0.7528201 0.5662177 0.6705569
Obstruction_Debris 0.6828030 0.0278461 0.9170038 0.8227255 0.7573302 0.5998879 0.9757232 0.9622677 1.0000000 0.7937764 0.2714363 0.7237340 0.8117931 0.7594222 0.5631391 0.4725877
Other 0.6668736 -0.1184067 0.7587071 0.8051196 0.8979099 0.9445498 0.8446837 0.7347822 0.7937764 1.0000000 0.7778087 0.9605269 0.9844827 0.9054438 0.8402201 0.3593215
Passenger Loading 0.3463388 -0.2985693 0.3725510 0.4622986 0.6539403 0.9267498 0.4127356 0.2636146 0.2714363 0.7778087 1.0000000 0.8556134 0.7359260 0.7011243 0.7977969 0.2684835
Passenger Train Interference 0.5872245 -0.2269049 0.7711136 0.7564155 0.8494920 0.9752919 0.8198952 0.7106868 0.7237340 0.9605269 0.8556134 1.0000000 0.9415782 0.8986873 0.8649865 0.4857091
Sick Injured Unruly Passenger 0.5954141 -0.0105223 0.7310978 0.7459639 0.8652815 0.9003905 0.8583588 0.7395528 0.8117931 0.9844827 0.7359260 0.9415782 1.0000000 0.9448665 0.7375805 0.2765033
Signal Switch Failure 0.6114195 0.2158549 0.7209778 0.7342568 0.8833295 0.8317744 0.8499444 0.7528201 0.7594222 0.9054438 0.7011243 0.8986873 0.9448665 1.0000000 0.6072113 0.3542022
Track Work 0.6652174 -0.5200273 0.7028399 0.7756815 0.7699593 0.9168026 0.6355407 0.5662177 0.5631391 0.8402201 0.7977969 0.8649865 0.7375805 0.6072113 1.0000000 0.5830179
Weather 0.6677474 -0.2466628 0.7798693 0.7157045 0.5505740 0.4295133 0.5798980 0.6705569 0.4725877 0.3593215 0.2684835 0.4857091 0.2765033 0.3542022 0.5830179 1.0000000
sql <- "SELECT CAST(strftime('%Y', c.Report_Month) AS INT) AS Year,
               c.Late_Cause,
               SUM(c.Late_Trains) AS Total_Late_Trains
        FROM Causes c
        INNER JOIN Lines l ON c.LineID = l.ID
        WHERE c.Late_Cause IN 
              (SELECT DISTINCT sub_c.Late_Cause 
               FROM Causes AS sub_c 
               WHERE strftime('%Y', sub_c.Report_Month) = '2012')
          AND c.Late_Cause <> 'TOTAL TRAINS DELAYED'
        GROUP BY strftime('%Y', c.Report_Month),
                 c.Late_Cause"

agg_sql <- dbGetQuery(conn, sql)

knitr::kable(agg_sql[sample(1:nrow(agg_sql), 20),])
Year Late_Cause Total_Late_Trains
138 2017 Human Error 1796
51 2011 Weather 3094
24 2010 Lift Deployment 1082
152 2018 Freight Interference - Total 3028
154 2018 Lift Deployment 640
71 2013 Freight Interference - Off-Peak 1706
4 2009 Freight Interference - Peak 688
97 2014 Passenger Loading 1509
139 2017 Lift Deployment 496
28 2010 Other 1158
81 2013 Passenger Train Interference 404
104 2015 Catenary Failure 302
110 2015 Obstruction/Debris 870
2 2009 Catenary Failure 116
68 2012 Weather 1262
126 2016 Other 484
107 2015 Freight Interference - Total 2086
31 2010 Sick, Injured, Unruly Passenger 1670
18 2010 Accident 1600
53 2012 Catenary Failure 162
ggplot(subset(agg_sql, Late_Cause != 'Freight Interference - Total'),
              aes(Year, Total_Late_Trains, fill=Late_Cause)) + 
  geom_bar(position = "fill", stat = "identity") +
  labs(title="Metra Late Trains by Cause", x="Year", y="Late Trains") +
  guides(fill=guide_legend(ncol=4)) +
  scale_x_continuous("year", breaks=unique(agg_sql$Year)) +
  scale_y_continuous(labels = scales::percent_format()) +
  scale_fill_manual(values=seaborn_palette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))

# DISCONNECT FROM DATABASE
dbDisconnect(conn)


Conclusion: Relational Databases in R